home *** CD-ROM | disk | FTP | other *** search
/ Amiga Plus 2002 #11 / Amiga Plus CD - 2002 - No. 11.iso / Online / GetAllHtml / GetAllHTML.rexx < prev   
OS/2 REXX Batch file  |  2002-10-28  |  22KB  |  608 lines

  1. /* GetAllHTML "URL"/A,"DestDir"/A,NOASK/S,ARC/S,PIC/S,RESUME/S,PAUSE/S,TERSE/S,DEPTH=/N/K,NOBASEINDEX/S,PORT=/K,BASEURL=/K,BROKENLINKS/S
  2.   v1.00 (22-08-02)  Copyright 1998-2002 Chris S Handley
  3.   (email: cshandley@iee.org)
  4.  
  5.   If you alter & distribute this, please mention me as the original author!
  6.  
  7.   Do not hold your breath for the E version as I have so little spare time.
  8.  
  9.   See GetAllHTML.doc for more details
  10. */
  11.  
  12. OPTIONS RESULTS    /* host commands hand their answer back in RESULT (GetHTML reads it after QUERY FINISHED) */
  13. Call Addlib('rexxsupport.library',0,-30,0)    /* support library: Delay(), MakeDir() and Delete() are called later on */
  14.  
  15. Say 'GetAllHTML v1.00  Copyright 1998-2002 Chris Handley (read program file for details)'
  16.  
  17.  /* set-up: path of the external downloader that is launched when no PORT switch is given */
  18. HTTPResume='Programs:Utils/Comms/HTTPResume'
  19.  
  20. ExtDir='T:'    /* files fetched for external links (BROKENLINKS mode) are stored under this prefix */
  21. TempFile='T:GetAllHTML'
       /* Pick a numeric suffix so that TempFile names a file that does not exist yet.
          The generator is seeded ONCE from the clock; re-seeding inside the loop with the
          same second-resolution value would hand back the same number on every pass.
          Time('S') is written with a quoted option so it does not depend on a variable
          called s being unset. */
  22. TempFileAdd = Random(1,999,Time('S'))
  23. DO UNTIL ~Exists(TempFile||TempFileAdd)
  24.     TempFileAdd = Random(1,999)
  25. END
  26. TempFile=TempFile||TempFileAdd
  27.  
  28.  /* deal with args */
       /* Argument layout: "URL" "DestDir" followed by up to eleven blank-separated switches.
          Results of this section: MainURL, DestDir, the Sw* flags, SwDepth (maximum length of
          a stem-list name, see DownloadList), Port (ARexx port name or '') and BaseURLDir
          (URL prefix, ending in '/', that a link must start with to count as internal). */
  29. Parse VALUE Arg(1) WITH '"' MainURL '"' . '"' DestDir '"' Switch1 Switch2 Switch3 Switch4 Switch5 Switch6 Switch7 Switch8 Switch9 Switch10 Switch11
  30. IF (MainURL='')|(DestDir='') THEN DO
  31.     Say 'ERROR:  Empty argument(s)!'
  32.         Say 'Usage:  GetAllHTML "URL"/A,"DestDir"/A,NOASK/S,ARC/S,PIC/S,RESUME/S,PAUSE/S,TERSE/S,DEPTH=/N/K,NOBASEINDEX/S,PORT=/K,BASEURL=/K,BROKENLINKS/S'
  33.     Say 'Note - both URL & DestDir *must* be enclosed in "double quotes".'
  34.     Say '     - after DEPTH should be a "=" followed by a number with NO spaces between them.'
  35.     Say '     - after PORT should be a "=" followed by a string with NO spaces between them.'
  36.     Say '     - after BASEURL should be a "=" followed by a string with NO spaces between them.'
  37.     Exit 20
  38. END
  39. IF (Right(DestDir,1)~='/')&(Right(DestDir,1)~=':') THEN DestDir=DestDir||'/'
  40. CALL MakeDir(DestDir)
  41. IF Left(MainURL,7)~='http://' THEN MainURL='http://'||MainURL
       /* there are eleven different switches, so eleven slots are parsed above and upper-cased here */
  42. Switch1=Upper(Switch1); Switch2=Upper(Switch2); Switch3=Upper(Switch3); Switch4=Upper(Switch4); Switch5=Upper(Switch5); Switch6=Upper(Switch6); Switch7=Upper(Switch7); Switch8=Upper(Switch8); Switch9=Upper(Switch9); Switch10=Upper(Switch10); Switch11=Upper(Switch11)
  43.  
       /* defaults: no pausing for <return> (SwNoPause=1) and a stem-name length limit of 30 */
  44. SwNoAsk=0; SwArc=0; SwPic=0; SwResume=0; SwDepth=30; SwNoPause=1; SwPort=0; SwTerse=0; BaseURLDir=''; SwBroken=0; SwNoBaseIndex=0
  45.  
  46. IF (Switch1='NOASK')|(Switch2='NOASK')|(Switch3='NOASK')|(Switch4='NOASK')|(Switch5='NOASK')|(Switch6='NOASK')|(Switch7='NOASK')|(Switch8='NOASK')|(Switch9='NOASK')|(Switch10='NOASK')|(Switch11='NOASK') THEN SwNoAsk=1
  47. IF (Switch1='ARC')|(Switch2='ARC')|(Switch3='ARC')|(Switch4='ARC')|(Switch5='ARC')|(Switch6='ARC')|(Switch7='ARC')|(Switch8='ARC')|(Switch9='ARC')|(Switch10='ARC')|(Switch11='ARC') THEN SwArc=1
  48. IF (Switch1='PIC')|(Switch2='PIC')|(Switch3='PIC')|(Switch4='PIC')|(Switch5='PIC')|(Switch6='PIC')|(Switch7='PIC')|(Switch8='PIC')|(Switch9='PIC')|(Switch10='PIC')|(Switch11='PIC') THEN SwPic=1
  49. IF (Switch1='RESUME')|(Switch2='RESUME')|(Switch3='RESUME')|(Switch4='RESUME')|(Switch5='RESUME')|(Switch6='RESUME')|(Switch7='RESUME')|(Switch8='RESUME')|(Switch9='RESUME')|(Switch10='RESUME')|(Switch11='RESUME') THEN SwResume=1
  50. IF (Switch1='PAUSE')|(Switch2='PAUSE')|(Switch3='PAUSE')|(Switch4='PAUSE')|(Switch5='PAUSE')|(Switch6='PAUSE')|(Switch7='PAUSE')|(Switch8='PAUSE')|(Switch9='PAUSE')|(Switch10='PAUSE')|(Switch11='PAUSE') THEN SwNoPause=0
  51. IF (Switch1='TERSE')|(Switch2='TERSE')|(Switch3='TERSE')|(Switch4='TERSE')|(Switch5='TERSE')|(Switch6='TERSE')|(Switch7='TERSE')|(Switch8='TERSE')|(Switch9='TERSE')|(Switch10='TERSE')|(Switch11='TERSE') THEN SwTerse=1
  52. IF (Switch1='BROKENLINKS')|(Switch2='BROKENLINKS')|(Switch3='BROKENLINKS')|(Switch4='BROKENLINKS')|(Switch5='BROKENLINKS')|(Switch6='BROKENLINKS')|(Switch7='BROKENLINKS')|(Switch8='BROKENLINKS')|(Switch9='BROKENLINKS')|(Switch10='BROKENLINKS')|(Switch11='BROKENLINKS') THEN SwBroken=1
  53. IF (Switch1='NOBASEINDEX')|(Switch2='NOBASEINDEX')|(Switch3='NOBASEINDEX')|(Switch4='NOBASEINDEX')|(Switch5='NOBASEINDEX')|(Switch6='NOBASEINDEX')|(Switch7='NOBASEINDEX')|(Switch8='NOBASEINDEX')|(Switch9='NOBASEINDEX')|(Switch10='NOBASEINDEX')|(Switch11='NOBASEINDEX') THEN SwNoBaseIndex=1
  54. IF (Left(Switch1,5)='DEPTH')|(Left(Switch2,5)='DEPTH')|(Left(Switch3,5)='DEPTH')|(Left(Switch4,5)='DEPTH')|(Left(Switch5,5)='DEPTH')|(Left(Switch6,5)='DEPTH')|(Left(Switch7,5)='DEPTH')|(Left(Switch8,5)='DEPTH')|(Left(Switch9,5)='DEPTH')|(Left(Switch10,5)='DEPTH')|(Left(Switch11,5)='DEPTH') THEN DO
  55.     Parse VALUE Upper(Arg(1)) WITH '"' . '"' . '"' . '"' . 'DEPTH=' Depth .
  56.     IF Depth='' THEN DO
  57.         Say 'No DEPTH number found (must use "DEPTH=x" where x is your number).'
  58.         Say 'Search pages up to a depth of: '
  59.         Pull Depth .    /* trailing "." keeps only the first word typed */
  60.     END
  61.     IF ~DataType(Depth,'W') THEN DO; Say 'ERROR:  DEPTH must be followed by a whole number (eg."DEPTH=5")!'; Exit 20; END    /* reject non-numbers before doing arithmetic on them */
  62.     IF Depth>42 THEN Depth=42    /* sanity protect against ARexx limitation */
       /* convert the page depth into the length of the matching stem-list name ("Root.1.2.") */
  63.     IF Depth<10 THEN
  64.         SwDepth=Depth*2    /* since each grows by 2 each depth (e.g.".2.3.4.5") */
  65.     ELSE
  66.         SwDepth=((Depth-9)*3)+(9*2) /* as above but above 9 grows by 3 (e.g.".12.13.14.15") */
  67.     SwDepth=SwDepth+5-2    /* 5 = length of "Root." */
  68. END
  69. IF (Left(Switch1,4)='PORT')|(Left(Switch2,4)='PORT')|(Left(Switch3,4)='PORT')|(Left(Switch4,4)='PORT')|(Left(Switch5,4)='PORT')|(Left(Switch6,4)='PORT')|(Left(Switch7,4)='PORT')|(Left(Switch8,4)='PORT')|(Left(Switch9,4)='PORT')|(Left(Switch10,4)='PORT')|(Left(Switch11,4)='PORT') THEN DO
  70.     SwPort=1
  71.     Parse VALUE Upper(Arg(1)) WITH '"' . '"' . '"' . '"' . 'PORT=' Port .
  72.     IF Port='' THEN DO
       /* no name given: fall back to the current host address, which must then be an HTTPResume port */
  73.         Port=Address()
  74.         IF Left(Port,11)~='HTTPRESUME.' THEN DO
  75.             Say 'ERROR:  PORT argument was not followed by a = and a string with no spaces between (eg."PORT=HTTPResume.1"), and the host enviroment was not already HTTPResume!'
  76.             Exit 20
  77.         END
  78.     END
  79.  END
  80. ELSE Port='' /*probably not necessary*/
  81. IF (Left(Switch1,7)='BASEURL')|(Left(Switch2,7)='BASEURL')|(Left(Switch3,7)='BASEURL')|(Left(Switch4,7)='BASEURL')|(Left(Switch5,7)='BASEURL')|(Left(Switch6,7)='BASEURL')|(Left(Switch7,7)='BASEURL')|(Left(Switch8,7)='BASEURL')|(Left(Switch9,7)='BASEURL')|(Left(Switch10,7)='BASEURL')|(Left(Switch11,7)='BASEURL') THEN DO
  82.     Parse VALUE Upper(Arg(1)) WITH '"' . '"' . '"' . '"' . 'BASEURL=' BaseURLDir .
  83.     IF BaseURLDir='' THEN DO
  84.         Say 'ERROR:  BASEURL argument was not followed by a = and a string with no spaces between (eg."BASEURL=www.amiga.com")!'
  85.         Exit 20
  86.     END
       /* recover the value in its original case; anchor the search on "BASEURL=" so that an
          identical string inside the main URL is not picked up instead (8 = length of "BASEURL=") */
  87.     BaseURLDir=SubStr(Arg(1),Index(Upper(Arg(1)),'BASEURL='||BaseURLDir)+8,Length(BaseURLDir))
       /* links are compared against BaseURLDir as full "http://..." URLs, so give it the same scheme prefix as MainURL */
  88.     IF Left(BaseURLDir,7)~='http://' THEN BaseURLDir='http://'||BaseURLDir; IF Right(BaseURLDir,1)~='/' THEN BaseURLDir=BaseURLDir||'/'
  89.  END
  90. ELSE DO
       /* default: everything of MainURL up to (and including) its last slash */
  91.     Parse VALUE Reverse(MainURL) WITH . '/' BaseURLDir
  92.     IF Length(BaseURLDir)<8 THEN BaseURLDir = Reverse(MainURL)     /*check for cases like FileURL="http://www.kosh.net" - i.e.no end slash*/
  93.     BaseURLDir=Reverse(BaseURLDir)||'/'
  94. END
  95.  
  96. If SwResume=1 THEN Say 'NOTE:  Resume mode activated!'
  97. If SwBroken=1 THEN Say 'NOTE:  Broken-link detection mode activated!'
  98.  
  99. IF Port='' THEN DO
  100.      /* run HTTPResume & set-up related stuff; OVERWRITE cause problems (restart from scratch if fails in the middle) */
  101.     Address Command 'Run >Nil: '||HTTPResume||' GUI NODATECHECK AUTORESUME STARTICONIFIED QUICKQUIT NOERRREQ RXPORTFILE='||TempFile /*NOENV removed*/
  102.     Say 'Waiting for HTTPResume...'
  103.     DO UNTIL Exists(TempFile)
  104.         Delay(25)
  105.     END
  106.     Delay(100)
  107.     IF ~Open(.port, TempFile, 'READ') THEN DO
  108.         Say 'ERROR:  Could not open "'||TempFile||'"!'
  109.         Exit 20
  110.     END
  111.     Port=ReadLn(.port)
  112.     Call Close(.port)
  113.     Call Delete(TempFile)
  114.     IF Port='***' THEN DO
  115.         Say 'ERROR:  HTTPResume could not open it''s ARexx port!'
  116.         Exit 20
  117.     END
  118.     Address(Port)
  119.  END
  120. ELSE DO
  121.     Address(Port)
  122. /*    SET OVERWRITE*/
  123.     SET NODATECHECK
  124.     SET AUTORESUME
  125.     SET QUICKQUIT
  126.     SET NOERRREQ
  127. END
  128.  
  129.  /* shared state used by the procedures below */
  130. ModemOnLine=0
  131. LastSuffix=''    /* suffix of the last download the user confirmed - reused to avoid asking again */
  132.  
  133.  /* seed the top-level list 'Root.' with the start page, flagged for link scanning */
  134. Root.1=MainURL
  135. Root.1.HTML=1
  136. Root.0=1
  137.  
  138. Say 'Downloading & scanning pages...'
  139. CALL DownloadList 'Root.',DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex
  140.  
  141. Say 'Finished.'
  142. IF ~SwPort THEN 'QUIT'    /* no PORT switch given: tell the HTTPResume we started to quit */
  143. Exit
  144.  
  145. DownloadList: PROCEDURE EXPOSE Root. Resume. ModemOnLine LastSuffix ExtDir
  146.     /* DownloadList(URLList,DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex) */
           /* Processes every entry of the stem list named by URLList (e.g. 'Root.' or 'Root.1.').
              List layout: <list>0 = number of entries, <list>n = URL, <list>n.HTML = 1 if the
              file should be scanned for links, <list>n.EXT = 1 if the link is external.
              For each entry the URL is mapped to a local file name, the file is fetched with
              GetHTML(), scanned with GetURLs() (which fills the sub-list <list>n.) and that
              sub-list is then processed by a recursive call.
                URLList    - name of the stem list, including the trailing '.'
                DestDir    - directory prefix under which internal files are stored
                BaseURLDir - URL prefix that is cut off to obtain the local path
                SwResume   - non-zero: skip over files that already exist until the first missing one
                SwDepth    - lists whose name is longer than this are not processed
                SwNoPause, SwTerse - control the warnings printed here
                SwNoAsk, SwArc, SwPic, SwBroken, SwNoBaseIndex - only passed on to GetURLs()/the recursion
              Shared state: Resume.<file> marks files already handled, ModemOnLine makes the
              "left off" prompt appear only once, ExtDir is the prefix for external files. */
  147.      /* grab args */
  148.     URLList=Arg(1)
  149.     DestDir=Arg(2)
  150.     BaseURLDir=Arg(3)
  151.     SwNoAsk=Arg(4)
  152.     SwArc=Arg(5)
  153.     SwPic=Arg(6)
  154.     SwResume=Arg(7)
  155.     SwDepth=Arg(8)
  156.     SwNoPause=Arg(9)
  157.     SwTerse=Arg(10)
  158.     SwBroken=Arg(11)
  159.     SwNoBaseIndex=Arg(12)
  160.  
           /* read the entry count stored in <URLList>0 */
  161.     INTERPRET 'URLListSize='||URLList||'0'
  162.  
  163.      /* deal with each URL in list in turn */
           /* the length of the list name grows with every level, so it serves as the depth limit */
  164.     IF Length(URLList)>SwDepth THEN
  165.         NOP
  166.      ELSE DO
  167.         IF URLListSize>0 THEN DO
  168. /*Say '-Length('URLList')='||Length(URLList)*/
  169.             DO i=1 TO URLListSize
                       /* fetch this entry's URL and flags through its full stem name */
  170.                 NewURLList=URLList||i
  171.                 INTERPRET 'URL='||NewURLList
  172.                 INTERPRET 'HTMLfile='||NewURLList||'.HTML'
  173.  
  174.                 INTERPRET 'ExternalLink='||NewURLList||'.EXT'
  175.                 IF ExternalLink~=1 THEN ExternalLink=0    /* an unset .EXT entry counts as "not external" */
  176.  
  177.                  /* decide on relative file & path */
  178.                 IF ExternalLink=0 THEN
  179.                      /* find local path */
  180.                     Parse VAR URL (BaseURLDir) PathFile
  181.                 ELSE DO
  182.                      /* outside normal search (external) - set PathFile as just file */
  183.                     Parse VALUE Reverse(URL) WITH PathFile '/' .
  184.                     PathFile=Reverse(PathFile)
  185.                 END
  186.                 IF (Right(PathFile,1)='/')|(PathFile='') THEN DO
  187.                     PathFile=PathFile||'InDeX.hTmL'    /* give filename-less pages a name */
  188.                     HTMLfile=1            /* force attempted scanning for HTMLs */
  189.                     GuessedURL=1
  190.                  END
  191.                 ELSE
  192.                     GuessedURL=0
                       /* split PathFile at its last slash into directory part (Path) and File */
  193.                 Parse VALUE Reverse(PathFile) WITH File '/' Path
  194.                 File=Reverse(File)
  195.                 Path=Reverse(Path)
  196.                 IF Path='' THEN DO
  197.                     File=PathFile
  198.                     Path=''
  199.                 END
  200.  
  201.                  /* create necessary dir(s) */
  202.                 PathLeft=Path                    /* use URL minus file at end */
  203.                 CurPath=DestDir
  204.                 DO While PathLeft~=''
  205.                     Parse VALUE PathLeft WITH NewDir '/' PathLeft
  206.                     IF NewDir~=='' THEN DO
  207.                         CurPath=CurPath||NewDir||'/'
  208.                         CALL MakeDir(Left(CurPath,Length(CurPath)-1))
  209.                      END
  210.                     ELSE DO
  211.                         IF SwTerse=0 THEN DO
  212.                             IF SwNoPause=0 THEN DO
  213.                                 Say 'WARNING:  Empty dir name in URL "'||URL||'" (press <return>)'
  214.                                 Pull Input
  215.                              END
  216.                             ELSE
  217.                                 Say 'WARNING:  Empty dir name in URL "'||URL||'"'
  218.                         END
  219.                     END
  220.                 END
                       /* internal files go below DestDir, external ones below ExtDir */
  221.                 IF ExternalLink=0 THEN
  222.                     DownloadFile=DestDir||PathFile
  223.                 ELSE
  224.                     DownloadFile=ExtDir||PathFile
  225.  
  226.  
                       /* RESUME mode: files that already exist are only re-scanned, not fetched again */
  227.                 IF SwResume~=0 THEN DO
  228.                     SeenBefore=0
  229.                     /*RxDownloadFile=Translate(DownloadFile,'abcdefghijklmnopqrstuvwxyz0123456789','abcdefghijklmnopqrstuvwxyz0123456789','_')*/
  230.                     RxDownloadFile=DownloadFile
  231.                     IF Resume.RxDownloadFile=1 THEN SeenBefore=1
  232.  
  233.                     IF SeenBefore=0 THEN DO            /* if visited this page before then pass! */
  234.                         IF Exists(DownloadFile) THEN DO
  235.                             Resume.RxDownloadFile=1
  236.  
  237.                             IF HTMLfile=1 THEN DO
  238.                                  /* parse page for URLs into a list */
  239.                                 CALL GetURLs(NewURLList||'.',DownloadFile,BaseURLDir,URL,SwNoAsk,SwArc,SwPic,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)
  240.                                          /* download pages from list */
  241.                                 CALL DownloadList(NewURLList||'.',DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)
  242.                             END
  243.                          END
  244.                         ELSE DO
  245.                             IF ExternalLink=0 THEN DO
  246.                                 SwResume=0    /* reached point did last time, now continue as before */
  247.                                 IF ModemOnLine=0 THEN DO    /* hack to ensure only halt for input once (so can leave alone) */
  248.                                     Say 'NOTE:  Reached point where left off! (press <return>)'
  249.                                     Pull Input
  250.                                 END
  251.                                 ModemOnLine=1
  252.                             END
  253.                         END
  254.  
  255.                     END
  256.                 END
  257.  
                       /* normal mode (or RESUME just switched off above); external links are always handled here */
  258.                 IF (SwResume=0)|(ExternalLink=1) THEN DO
  259.                     IF ~Exists(DownloadFile) THEN DO    /* if visited this page before then pass! */
  260.                         IF ExternalLink=0 THEN DO
  261.                             /* download file */
  262.                             CALL GetHTML(URL,DownloadFile)
  263.  
  264.                             /* see if was downloaded */
  265.                             IF ~Exists(DownloadFile) THEN DO
  266.                                 IF SwTerse=0 THEN DO
  267.                                     IF GuessedURL~=1 THEN DO
  268.                                         IF SwNoPause=0 THEN DO
  269.                                             Say 'WARNING:  Couldn''t download file "'||DownloadFile||'" (press <return>)'
  270.                                             Pull Input
  271.                                          END
  272.                                         ELSE
  273.                                             Say 'WARNING:  Couldn''t download file "'||DownloadFile||'"'
  274.                                     END
  275.                                 END
  276.  
  277.                                 /* if not downloaded then place an empty 'fake' file to stop RESUME stopping too early */
  278.                                 Call Open(.file, DownloadFile, 'WRITE')
  279.                                     Call Close(.file)
  280.                              END
  281.                             ELSE DO
  282.                                 /* scan downloaded file if asked to */
  283.                                 IF HTMLfile=1 THEN DO
  284.                                      /* parse page for URLs into a list */
  285.                                     CALL GetURLs(NewURLList||'.',DownloadFile,BaseURLDir,URL,SwNoAsk,SwArc,SwPic,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)
  286.                                              /* download pages from list */
  287.                                     CALL DownloadList(NewURLList||'.',DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex)
  288.                                 END
  289.                             END
  290.                          END
  291.                         ELSE DO
                               /* external link: fetched only to test that it exists; the file is deleted again and never scanned */
  292.                             /* download file, only if haven't done before (hijacked how RESUME checks) */
  293.                             /*RxDownloadFile=Translate(DownloadFile,'abcdefghijklmnopqrstuvwxyz0123456789','abcdefghijklmnopqrstuvwxyz0123456789','_')*/
  294.                             RxDownloadFile=DownloadFile
  295.                             IF Resume.RxDownloadFile~=1 THEN DO
  296.                                 Resume.RxDownloadFile=1
  297.  
  298.                                 /* download file */
  299.                                 CALL GetHTML(URL,DownloadFile)
  300.  
  301.                                 /* re-try downloading twice, in case of a 'freak' connect failure */
  302.                                 IF ~Exists(DownloadFile) THEN DO
  303.                                     CALL Delay(50)
  304.                                     CALL GetHTML(URL,DownloadFile)
  305.  
  306.                                     IF ~Exists(DownloadFile) THEN DO
  307.                                         CALL Delay(50)
  308.                                         CALL GetHTML(URL,DownloadFile)
  309.                                     END
  310.                                 END
  311.  
  312.                                 IF ~Exists(DownloadFile) THEN DO
                                       /* strip the trailing '.' from the list name to get the entry holding the URL of the page this list was built from */
  313.                                     Parse VALUE Reverse(URLList) WITH . '.' BrokePage
  314.                                     INTERPRET 'BrokePage='||Reverse(BrokePage)
  315.                                     Say 'Found BROKEN LINK to "'||URL||'" in "'||BrokePage||'"'
  316.                                  END
  317.                                 ELSE
  318.                                     CALL Delete(DownloadFile)
  319.                             END
  320.                         END
  321.                     END
  322.                 END
  323.             END
  324.         END
  325.     END
  326. Return
  327.  
  328. GetURLs: PROCEDURE EXPOSE Root. LastSuffix
  329.     /* GetURL(URLList,DownloadFile,BaseURLDir,FileURL,SwNoAsk,SwArc,SwPic,SwNoPause,SwTerse,SwBroken,SwNoBaseIndex) */
           /* Scans the local file DownloadFile line by line for HREF= and SRC= attributes and
              stores every link that should be followed in the stem list named by URLList:
              <list>0 = count, <list>n = URL, <list>n.HTML = 1 if the target should itself be
              scanned, <list>n.EXT = 1 if it lies outside BaseURLDir.
                URLList       - name of the stem list to fill, including the trailing '.'
                DownloadFile  - local file to read
                BaseURLDir    - URL prefix that marks a link as internal
                FileURL       - URL the file was fetched from (base for relative links)
                SwNoAsk       - 1: never ask; file types not covered by SwArc/SwPic are skipped
                SwArc, SwPic  - 1: fetch archives / pictures without asking
                SwTerse       - 0: report non-http links
                SwBroken      - 1: also store external links (flagged .EXT) instead of dropping them
                SwNoBaseIndex - 1: drop links that point back at the index of BaseURLDir
                SwNoPause     - accepted but not used by the active code
              LastSuffix (shared) remembers the suffix of the last download the user confirmed. */
  330.      /* get args */
  331.     URLList=Arg(1)
  332.     DownloadFile=Arg(2)
  333.     BaseURLDir=Arg(3)
  334.     FileURL=Arg(4)
  335.     SwNoAsk=Arg(5)
  336.     SwArc=Arg(6)
  337.     SwPic=Arg(7)
  338.     SwNoPause=Arg(8)
  339.     SwTerse=Arg(9)
  340.     SwBroken=Arg(10)
  341.     SwNoBaseIndex=Arg(11)
  342.  
           /* start with an empty list, so the caller sees a count of 0 even if the file cannot be opened */
  343.     INTERPRET URLList||'0 = 0'
  344.  
  345.      /* expand path to global, if is local reference like "/new/0083.html" */
  346.     Parse VALUE Reverse(FileURL) WITH . '/' LocalURLDir
  347.     IF Length(LocalURLDir)<8 THEN LocalURLDir = Reverse(FileURL)    /*check for cases like FileURL="http://www.kosh.net" - i.e.no end slash*/
  348.     LocalURLDir=Reverse(LocalURLDir)||'/'
  349.  
  350.     PARSE VAR LocalURLDir 'http://' LocalURLDomain '/' .    /*recover domain from URL*/
  351.     LocalURLDomain='http://'||LocalURLDomain
  352.  
  353.      /* parse (possibly) downloaded HTML file for URLs */
  354.     IF Open(.file, DownloadFile, 'READ') THEN DO
  355. /*Say 'Reading HTML file "'||DownloadFile||'"'*/
  356.         DO WHILE ~EOF(.file)
  357.              /* search for HTML ref. links */
  358.             Line=ReadLn(.file)
  359.             ULine=Upper(Line)
  360.  
                   /* Mode 0 walks through all HREF= hits of the line, Mode 1 then walks through all
                      SRC= hits; NewPos=-1 signals that the line is exhausted */
  361.             NewPos=0; Mode=0
  362.             DO UNTIL NewPos<0
  363.                  /* non-frame search */
  364.                 IF Mode=0 THEN DO
  365.                     NewPos=Pos('HREF=',ULine,NewPos+1)    /*finds "<AREA HREF" "<A HREF" "<A/nHREF" ...*/
  366.                     IF NewPos=0 THEN DO
  367.                         Mode=1
  368.                         NewPos=0
  369.                     END
  370.                 END
  371.                  /* frame/image search */
  372.                 IF Mode=1 THEN DO
  373.                     Done=1
  374.                      /* "SRC=" occurs for both in frames & images */
  375.                     NewPos=Pos('SRC=',ULine,NewPos+1)
  376.                     IF NewPos=0 THEN NewPos=-1
  377.                 END
  378.  
  379.                  /* expand URL to full path, remove non-file parts & store only if inside parameters */
  380.                 IF NewPos>0 THEN DO
  381.                     Parse VAR Line =NewPos '="' URL '"'
  382.                     IF URL='' THEN Parse VAR Line =NewPos '=\"' URL '"'    /*javascripts precede "s by a slash*/
  383. /*Say '-Found URL "'||URL||'"'*/
  384.                     IF URL~=='' THEN DO
  385.                         Parse UPPER VAR URL URLDev ':' URLRest
  386.                         Download=1
  387.                         IF (URLRest~=='')&(URLDev~='HTTP') THEN DO
  388.                              /* found e.g. "mailto:" */
  389.                             IF SwTerse=0 THEN Say 'Found non-http link "'||URL||'"'
  390.                             Download=0
  391.                         END
                               /* no ':' in the link at all -> it is a relative reference that must be expanded */
  392.                         IF URLDev=Upper(URL) THEN DO
  393.                         /*    IF Left(URL,1)='/' THEN DO
  394.                                 URL=SubStr(URL,2)        /* remove pre-slash */
  395.                                 DO While Left(URL,2)='..'    /* convert "../" to "//" */
  396.                                     URL='/'||SubStr(URL,3)
  397.                                 END
  398.                                 /*???remove pre-slash (again)???*/
  399.                                 URL=LocalURLDomain||URL    /* pre-slash finally replaced by domain name */
  400.                              END
  401.                             ELSE DO
  402.                                 DO While Left(URL,2)='..'    /* convert "../" to "//" */
  403.                                     URL='/'||SubStr(URL,3)
  404.                                 END
  405.                                 IF Left(URL,1)='/' THEN URL=SubStr(URL,2)    /* remove 1st spurious pre-slash (otherwise path wrongly interpreted) */
  406.                                 URL=LocalURLDir||URL    /* local reference -> expand to full */
  407.                             END
  408.                         */
  409.                             IF Left(URL,1)='/' THEN URL=SubStr(URL,2)    /* remove pre-slash */
  410.                         /*    DO While Left(URL,2)='..'            /* convert "../" to "//" */
  411.                                 URL='/'||SubStr(URL,3)
  412.                             END
  413.                         */
  414.  
  415.                             /*handle "../"s*/
  416.                             newURL=URL
  417.                             DO UNTIL URL=newURL
  418.                                 URL=newURL
  419.                                 Parse VAR URL newURLpre '../' newURLpost
  420.                                 IF newURLpost~='' THEN newURL=newURLpre||'//'||newURLpost
  421.                             END
  422.                             URL=newURL
  423.  
  424.                             /*handle "./"s*/
  425.                             newURL=URL
  426.                             DO UNTIL URL=newURL
  427.                                 URL=newURL
  428.                                 Parse VAR URL newURLpre './' newURLpost
  429.                                 IF newURLpost~='' THEN newURL=newURLpre||newURLpost
  430.                             END
  431.                             URL=newURL
  432.  
  433.                             IF Left(URL,1)='/' THEN URL=SubStr(URL,2)    /* remove 1st spurious pre-slash (otherwise path wrongly interpreted) */
  434.                             URL=LocalURLDir||URL                /* local reference -> expand to full */
  435.  
  436.                              /* if have double-slashes (go down dir), then remove relevant dirs */
  437.                             Done=0
  438.                             DO Until Done=1            /* an algorithm with a bit of magic! */
  439.                                 URLLen=Length(URL)
  440.                                 EndDPos=Index(URL,'//',8)                /* marks end of '//' */
  441.                                 IF EndDPos>0 THEN DO
  442.                                     StartDPos=Index(Reverse(URL),'/',URLLen-EndDPos+2)
  443.                                     IF StartDPos>0 THEN DO
  444.                                         StartDPos=URLLen-StartDPos+1        /* marks 1st slash before '//' */
  445.                                         URL=Left(URL,StartDPos)||SubStr(URL,EndDPos+2)
  446.                                      END
  447.                                     ELSE
  448.                                         Done=1
  449.                                  END
  450.                                 ELSE
  451.                                     Done=1
  452.                             END
  453.                          END
  454.                         ELSE DO
  455.                             IF URLRest=='' THEN Download=0        /* nothing after ":" */
  456.                         END
  457.  
  458.                         IF Download=1 THEN DO
  459.                              /* remove "#search" from "http:path/file#search" */
  460.                             IF Index(URL,'#')~=0 THEN DO
  461.                                 Parse VALUE Reverse(URL) WITH . '#' URL
  462.                                 URL=Reverse(URL)
  463.                             END
  464.  
  465.                              /* remove "?search" from "http:path/file?search" */
  466.                             IF Index(URL,'?')~=0 THEN DO
  467.                                 Parse VALUE Reverse(URL) WITH . '?' URL
  468.                                 URL=Reverse(URL)
  469.                             END
  470.  
  471.                              /* used to check for suffix & if it is not part of e.g. www.amiga.com */
  472.                             Parse VALUE Reverse(URL) WITH URLFile '/' .
  473.                             Parse VAR URLFile Suffix '.' .
  474.                             URLFile=Reverse(URLFile)
  475.                             Suffix=Reverse(Suffix)
  476.                             DirSuffix=0
  477.                             GotSuffix=0
  478.                             IF Suffix~=URLFile THEN DO
  479.                                 GotSuffix=1
                                       /* last slash belongs to "http://" -> the "suffix" is really part of the host name */
  480.                                 IF Index(Reverse(URL),'/')>(Length(URL)-7) THEN DO
  481.                                     DirSuffix=1
  482.                                     GotSuffix=0
  483.                                 END
  484.                             END
  485.  
  486.                             IF (GotSuffix=0)&(Right(URL,1)~='/')&(Index(URL,'?')=0) THEN
  487.                                 URL2=URL||'/'    /* MAY need to add implicit slash */
  488.                             ELSE
  489.                                 URL2=''
  490.  
  491.                             ExternalLink=0    /* reset for EVERY link (was misspelt, so the flag stuck at 1 after the first external link) */
  492.                             IF Left(URL,Length(BaseURLDir))~==BaseURLDir THEN DO
  493.                                 IF SwBroken=0 THEN
  494.                                     Download=0    /* don't download pages below initial dir */
  495.                                 ELSE
  496.                                     ExternalLink=1    /* do download but no further */
  497.                              END
  498.                             ELSE DO
  499.                                  /* check if URL is BaseURL's index (ie."baseurl/" or "baseurl/index.html") */
  500.                                 IF SwNoBaseIndex=1 THEN DO
  501.                                     AboveBaseURLDir=Upper(Right(URL,Length(URL)-Length(BaseURLDir)))
                                           /* BaseURLDir already ends in '/', so the bare "baseurl/" leaves an EMPTY remainder */
  502.                                     IF (AboveBaseURLDir=='')|(AboveBaseURLDir='/')|(Left(AboveBaseURLDir,6)='INDEX.') THEN Download=0
  503.                                 END
  504.                             END
  505.  
  506.                              /* check if should download this file-type */
  507.                             HTMLfile=0
  508.                             IF (GotSuffix=1)&(ExternalLink=0) THEN DO    /*never consider external links; "&" is AND ("&&" would be exclusive OR)*/
  509.                                 Suffix=Upper(Left(Suffix,3,' '))
  510.                                  /* as well as always downloading HTML files, also intelligently downloads if suffix same as last user-confirmed download */
  511.                                 IF (Suffix~='HTM')&(Suffix~='SHT')&(Suffix~='SH ')&(Suffix~=LastSuffix) THEN DO
  512.                                     Ask=1
  513.                                     Arc=0; Pic=0
  514.                                     IF (Suffix='LZX')|(Suffix='LHA')|(Suffix='ZIP')|(Suffix='LZH')|(Suffix='ZOO') THEN Arc=1
  515.                                     IF (Suffix='GIF')|(Suffix='JPG')|(Suffix='JPE')|(Suffix='PNG')|(Suffix='JFI')|(Suffix='SWF') THEN Pic=1
  516.  
  517.                                     IF (Arc=1)&(SwArc=1) THEN Ask=0
  518.                                     IF (Pic=1)&(SwPic=1) THEN Ask=0
  519.  
  520.                                     IF Download=1 THEN DO
  521.                                         IF Ask=1 THEN DO
  522.                                             IF SwNoAsk=1 THEN
  523.                                                 Download=0
  524.                                             ELSE DO
  525.                                                 Say 'QUERY:  Download file "'||URL||'"?'
  526.                                                 DO Until Input~=''
  527.                                                     Pull Input
  528.                                                 END
  529.                                                 IF Left(Input,1)='N' THEN
  530.                                                     Download=0
  531.                                                 ELSE
  532.                                                     LastSuffix=Suffix
  533.                                             END
  534.                                         END
  535.                                     END
  536.                                   END
  537.                                 ELSE
  538.                                     HTMLfile=1
  539.                             END
  540.  
  541.                             IF FileURL=URL  THEN Download=0        /* avoid self-referencing infinite loops */
  542. /*IF Download=1 THEN DO
  543. IF URL2='' THEN
  544.     Say '--Final URL="'||URL||'", Download='||Download
  545. ELSE
  546.     Say '--Final URL="'||URL||'"(/), Download='||Download
  547. END*/
  548.  
  549.                             IF Download=1 THEN DO
  550.                                  /* store URL in list */
  551.                                 URL=Strip(URL,'T')
  552.                                 INTERPRET 'URLListSize='||URLList||'0 + 1'
  553.                                 INTERPRET URLList||'0 = URLListSize'
  554.                                 INTERPRET URLList||URLListSize||' = URL'
  555.  
  556.                                 INTERPRET URLList||URLListSize||'.HTML = HTMLfile'    /* record whether file should be scanned! */
  557.                                 INTERPRET URLList||URLListSize||'.EXT = ExternalLink'    /* record is external link */
  558.  
  559.                                  /* add 2nd possible interpretation of URL to list*/
  560.                                 IF URL2~='' THEN DO
  561.                                     URL2=Strip(URL2,'T')
  562.                                     INTERPRET 'URLListSize='||URLList||'0 + 1'
  563.                                     INTERPRET URLList||'0 = URLListSize'
  564.                                     INTERPRET URLList||URLListSize||' = URL2'
  565.  
  566.                                     INTERPRET URLList||URLListSize||'.HTML = HTMLfile'    /* record whether file should be scanned! */
  567.                                     IF ExternalLink=1 THEN INTERPRET URLList||URLListSize||'.EXT = 1' /* record is external link */
  568.                                 END
  569.                             END
  570.                         END
  571.                      END
  572. /*                    ELSE DO
  573.                         IF SwTerse=0 THEN DO
  574.                             IF SwNoPause=0 THEN DO
  575.                                 Say 'WARNING:  Empty URL at pos '||NewPos||' in line "'||Line||'" (press <return>)'
  576.                                 Pull Input
  577.                              END
  578.                             ELSE
  579.                                 Say 'WARNING:  Empty URL at pos '||NewPos||' in line "'||Line||'"'
  580.                         END
  581.  
  582.                     END
  583. */
  584.                 END
  585.             END
  586.         END
  587.         CALL Close(.file)
  588.     END
  589. Return
  590.  
  591. GetHTML: PROCEDURE
  592.     /* GetHTML(TheURL,File) - asks the current host (the HTTPResume port) to fetch TheURL into File */
  593.      /* returns once the host answers QUERY FINISHED with a value that is not above 0 */
  594.     Target=Arg(1)
  595.     OutName=Arg(2)
  596.  
  597.      /* host commands are written as quoted strings, so they cannot be altered by same-named variables */
  598.     'SET OUTFILE' OutName
  599.     'SET URL' Target
  600.     'START'
  601.     DO FOREVER
  602.         CALL Delay(50)    /* poll interval */
  603.         'QUERY FINISHED'
  604.         IF ~(Result>0) THEN LEAVE
  605.     END
  606.      /* nothing is returned; the caller checks for the output file itself */
  607. Return
  608.